<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>

    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta content="Cask Data, Inc." name="author" />
<meta content="Copyright © 2014-2017 Cask Data, Inc." name="copyright" />


    <meta name="git_release" content="6.1.1">
    <meta name="git_hash" content="05fbac36f9f7aadeb44f5728cea35136dbc243e5">
    <meta name="git_timestamp" content="2020-02-09 08:22:47 +0800">
    <title>FileSet Dataset</title>

    <link rel="stylesheet" href="../../_static/cdap-bootstrap.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/bootstrap-3.3.6/css/bootstrap.min.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/bootstrap-3.3.6/css/bootstrap-theme.min.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/css/bootstrap-sphinx.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/css/cdap-dynamicscrollspy-4.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/css/jquery.mCustomScrollbar.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/css/cdap-jquery.mCustomScrollbar.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/css/abixTreeList-2.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/cdap-bootstrap.css" type="text/css" />

    <script type="text/javascript">
      // Global, page-level settings object. Presumably read by the Sphinx
      // helper scripts loaded right after this block (doctools.js etc.) --
      // NOTE(review): confirm against those scripts.
      var DOCUMENTATION_OPTIONS = {
        "URL_ROOT": "",
        "VERSION": "6.1.1",
        "COLLAPSE_INDEX": false,
        "FILE_SUFFIX": ".html",
        "HAS_SOURCE": false
      };
    </script>
    <script type="text/javascript" src="../../_static/jquery.js"></script>
    <script type="text/javascript" src="../../_static/underscore.js"></script>
    <script type="text/javascript" src="../../_static/doctools.js"></script>
    <script type="text/javascript" src="../../_static/language_data.js"></script>

    <link rel="shortcut icon" href="../../_static/favicon.ico"/>
    <link rel="index" title="Index" href="../../genindex.html" />
    <link rel="search" title="Search" href="../../search.html" />
    <link rel="top" title="Cask Data Application Platform 6.1.1 Documentation" href="../../index.html" />
    <link rel="up" title="Datasets" href="index.html" />
    <link rel="next" title="Partitioned FileSet" href="partitioned-fileset.html" />
    <link rel="prev" title="Tables" href="table.html" />
    <!-- block extrahead -->
    <meta charset='utf-8'>
    <meta http-equiv='X-UA-Compatible' content='IE=edge,chrome=1'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <meta name="apple-mobile-web-app-capable" content="yes">
    <!-- block extrahead end -->

</head>
<body role="document">

<!-- block navbar -->
<div id="navbar" class="navbar navbar-inverse navbar-default navbar-fixed-top">
    <div class="container-fluid">
      <div class="row">
        <div class="navbar-header">
          <!-- .btn-navbar is used as the toggle for collapsed navbar content -->
          <a class="navbar-brand" href="../../table-of-contents/../../index.html">
            <span><img alt="CDAP logo" src="../../_static/cdap_logo.svg"/></span>
          </a>

          <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".nav-collapse">
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
            <span class="icon-bar"></span>
          </button>

          <div class="pull-right">
            <div class="dropdown version-dropdown">
              <a href="#" class="dropdown-toggle" data-toggle="dropdown"
                role="button" aria-haspopup="true" aria-expanded="false">
                v 6.1.1 <span class="caret"></span>
              </a>
              <ul class="dropdown-menu">
                <li><a href="//docs.cdap.io/cdap/5.1.2/en/index.html">v 5.1.2</a></li>
                <li><a href="//docs.cdap.io/cdap/4.3.4/en/index.html">v 4.3.4</a></li>
              </ul>
            </div>
          </div>
          <form class="navbar-form navbar-right navbar-search" action="../../search.html" method="get">
            <div class="form-group">
              <div class="navbar-search-image material-icons"></div>
              <input type="text" name="q" class="form-control" placeholder="  Search" />
            </div>
            <input type="hidden" name="check_keywords" value="yes" />
            <input type="hidden" name="area" value="default" />
          </form>

          <div class="collapse navbar-collapse nav-collapse navbar-right navbar-navigation">
            <ul class="nav navbar-nav"><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../../table-of-contents/../../index.html">简介</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link current" href="../../table-of-contents/../../guides.html">手册</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../../table-of-contents/../../reference-manual/index.html">参考</a></li><li class="docsite-nav-tab-container"><a class="docsite-nav-tab-link " href="../../table-of-contents/../../faqs/index.html">帮助</a></li>
            </ul>
          </div>

        </div>
      </div>
    </div>
  </div><!-- block navbar end -->
<!-- block main content -->
<div class="main-container container">
  <div class="row"><div class="col-md-2">
      <div id="sidebar" class="bs-sidenav scrollable-y-outside" role="complementary">
<!-- theme_manual: developer-manual -->
<!-- theme_manual_highlight: guides -->
<!-- sidebar_title_link: ../../table-of-contents/../../guides.html -->

  <div role="note" aria-label="manuals links"><h3><a href="../../table-of-contents/../../guides.html">Guides</a></h3>

    <ul class="this-page-menu">
      <li class="toctree-l1"><a href="../../table-of-contents/../../user-guide/index.html" rel="nofollow">用户手册</a>
      </li>
      <li class="toctree-l1"><b><a href="../../table-of-contents/../../developer-manual/index.html" rel="nofollow">开发手册</a></b>
      <nav class="pagenav">
      <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../index.html"> 简介</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../getting-started/index.html"> 入门指南</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../getting-started/sandbox/index.html">CDAP Sandbox</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../getting-started/sandbox/zip.html">二进制 Zip 文件</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getting-started/sandbox/zip.html#cdap-sandbox">启动和停止 CDAP Sandbox</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getting-started/sandbox/virtual-machine.html">虚拟机镜像</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getting-started/sandbox/docker.html">Docker 镜像</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../getting-started/quick-start.html">快速入门</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../getting-started/dev-env.html">搭建开发环境</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../getting-started/start-stop-cdap.html">启动和停止 CDAP</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../getting-started/building-apps.html">构建并运行应用</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../overview/index.html"> 概述</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../overview/anatomy.html"> 大数据应用剖析</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../overview/modes.html"> 模式和组件</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../overview/abstractions.html"> 核心概念</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../overview/interfaces.html"> 编程接口</a></li>
</ul>
</li>
<li class="toctree-l1 current"><a class="reference internal" href="../index.html"> 抽象概念</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../core.html"> Core Abstractions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../applications.html"> Applications</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="index.html"> Datasets</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="overview.html"> Overview</a></li>
<li class="toctree-l3"><a class="reference internal" href="table.html"> Table API</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#"> FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="partitioned-fileset.html"> Partitioned FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="time-partitioned-fileset.html"> TimePartitioned FileSets</a></li>
<li class="toctree-l3"><a class="reference internal" href="system-custom.html"> System and Custom Datasets</a></li>
<li class="toctree-l3"><a class="reference internal" href="permissions.html"> Dataset Permissions</a></li>
<li class="toctree-l3"><a class="reference internal" href="cube.html"> Cube Dataset</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../mapreduce-programs.html"> MapReduce Programs</a></li>
<li class="toctree-l2"><a class="reference internal" href="../plugins.html"> Plugins</a></li>
<li class="toctree-l2"><a class="reference internal" href="../schedules.html"> Schedules</a></li>
<li class="toctree-l2"><a class="reference internal" href="../secure-keys.html"> Secure Keys</a></li>
<li class="toctree-l2"><a class="reference internal" href="../services.html"> Services</a></li>
<li class="toctree-l2"><a class="reference internal" href="../spark-programs.html"> Spark Programs</a></li>
<li class="toctree-l2"><a class="reference internal" href="../workers.html"> Workers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../workflows.html"> Workflows</a></li>
<li class="toctree-l2"><a class="reference internal" href="../artifacts.html"> Artifacts</a></li>
<li class="toctree-l2"><a class="reference internal" href="../program-lifecycle.html"> Program Lifecycle</a></li>
<li class="toctree-l2"><a class="reference internal" href="../namespaces.html"> Namespaces</a></li>
<li class="toctree-l2"><a class="reference internal" href="../transaction-system.html"> Transaction System</a></li>
<li class="toctree-l2"><a class="reference internal" href="../transactional-messaging-system.html"> Transactional Messaging System</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../metadata/index.html"> 元数据</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/system-metadata.html"> System Metadata</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/discovery-lineage.html"> Discovery and Lineage</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/field-lineage.html"> Field Level Lineage</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/audit-logging.html"> Audit Logging</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/metadata-ui.html"> CDAP Metadata UI</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../metadata/programmatic-metadata.html"> Accessing metadata programmatically</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../pipelines/index.html"> 数据流管道</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/concepts-design.html"> Concepts and Design</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/getting-started.html"> Getting Started</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/studio.html"> CDAP Studio</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/creating-pipelines.html"> Creating Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/running-pipelines.html"> Running Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/plugin-management.html"> Plugin Management</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/plugins/index.html"> Plugin Reference</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/actions/index.html"> Action Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/sources/index.html"> Source Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/transforms/index.html"> Transform Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/analytics/index.html"> Analytic Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/sinks/index.html"> Sink Plugins</a><ul class="simple">
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/shared-plugins/index.html"> Shared Plugins</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../../pipelines/plugins/shared-plugins/core.html">CoreValidator</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/plugins/post-run-plugins/index.html"> Post-run Plugins</a><ul class="simple">
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/developing-pipelines.html"> Developing Pipelines</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/developing-plugins/index.html"> Developing Plugins</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/developing-plugins/plugin-basics.html">Plugin Basics</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/developing-plugins/creating-a-plugin.html">Creating a Plugin</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/developing-plugins/presentation-plugins.html">Plugin Presentation</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/developing-plugins/testing-plugins.html">Testing Plugins</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../pipelines/developing-plugins/packaging-plugins.html">Packaging Plugins</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../pipelines/how-cdap-pipelines-work.html"> How CDAP Pipelines Work</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../cloud-runtimes/index.html"> 云平台运行</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../cloud-runtimes/concepts/index.html"> Concepts</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../cloud-runtimes/provisioners/index.html"> Provisioners</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/provisioners/gcp-dataproc.html">Google Dataproc</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/provisioners/aws-emr.html">Amazon Elastic MapReduce</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/provisioners/remote-hadoop.html">Remote Hadoop</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../cloud-runtimes/profiles/index.html"> Profiles</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/profiles/creating-profiles.html">Creating Profiles</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/profiles/assigning-profiles.html">Assigning Profiles</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../cloud-runtimes/profiles/admin-controls.html">Admin Controls</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../cloud-runtimes/example/index.html"> Example</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../security/index.html"> 安全</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../security/client-authentication.html">Client Authentication</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../security/cdap-authentication-clients-java.html">CDAP Authentication Client for Java</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../security/cdap-authentication-clients-python.html">CDAP Authentication Client for Python</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../security/custom-authentication.html">Custom Authentication</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../security/authorization-extensions.html">Authorization Extensions</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../testing/index.html"> 测试和调试</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../testing/testing.html"> Testing a CDAP Application</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../testing/debugging.html"> Debugging</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../testing/troubleshooting.html"> Troubleshooting</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../ingesting-tools/index.html"> 数据融合</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../ingesting-tools/cdap-stream-clients-java.html">CDAP Stream Client for Java</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../ingesting-tools/cdap-stream-clients-python.html">CDAP Stream Client for Python</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../ingesting-tools/cdap-stream-clients-ruby.html">CDAP Stream Client for Ruby</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../ingesting-tools/cdap-flume.html">CDAP Flume</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../data-exploration/index.html"> 数据探索</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../data-exploration/filesets.html"> Fileset Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../data-exploration/tables.html"> Table Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../data-exploration/object-mapped-tables.html"> ObjectMappedTable Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../data-exploration/custom-datasets.html"> Custom Dataset Exploration</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../data-exploration/hive-execution-engines.html"> Hive Execution Engines</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/index.html"> 高级主题</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../advanced/application-logback.html"> Application Logback</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../advanced/best-practices.html"> Best Practices</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../advanced/class-loading.html"> Class Loading</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../advanced/configuring-resources.html"> Configuring Program Resources</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../advanced/program-retry-policies.html"> Program Retry Policies</a></li>
</ul>
</li>
</ul>
</nav>
      </li>
      <li class="toctree-l1"><a href="../../table-of-contents/../../admin-manual/index.html" rel="nofollow">管理手册</a>
      </li>
      <li class="toctree-l1"><a href="../../table-of-contents/../../integrations/index.html" rel="nofollow">集成手册</a>
      </li>
      <li class="toctree-l1"><a href="../../table-of-contents/../../examples-manual/index.html" rel="nofollow">最佳实践</a>
      </li>
    </ul>
  </div></div>
    </div><div class="col-md-8 content" id="main-content">
    
  <div class="section" id="fileset-dataset">
<span id="datasets-fileset"></span><h1>FileSet Dataset<a class="headerlink" href="#fileset-dataset" title="Permalink to this headline">🔗</a></h1>
<p>While real-time programs such as real-time Spark normally require datasets with random access, batch-oriented
programming paradigms such as MapReduce are more suitable for data that can be read and written sequentially.
The most prominent form of such data is an HDFS file, and MapReduce is highly optimized for such files.
CDAP’s abstraction for files is the <em>FileSet</em> dataset.</p>
<p>A <em>FileSet</em> represents a set of files on the file system that share certain properties:</p>
<ul class="simple">
<li>The location in the file system. All files in a FileSet are located relative to a
base path, which is created when the FileSet is created. Deleting the
FileSet will also delete this directory and all the files it contains.</li>
<li>The Hadoop input and output format. They are given as dataset properties by their
class names.  When a FileSet is used as the input or output of a MapReduce program,
these classes are injected into the Hadoop configuration by the CDAP runtime
system.</li>
<li>Additional properties of the specified input and output format. Each format has its own
properties; consult the format’s documentation for details. For example, the
<code class="docutils literal notranslate"><span class="pre">TextOutputFormat</span></code> allows configuring the field separator character by setting the
property <code class="docutils literal notranslate"><span class="pre">mapreduce.output.textoutputformat.separator</span></code>. These properties are also set
into the Hadoop configuration by the CDAP runtime system.</li>
</ul>
<p>These properties are configured at the time the FileSet is created. They apply to all
files in the dataset. Every time you use a FileSet in your application code, you can
address either the entire dataset or, by specifying its relative path as a runtime argument,
an individual file in the dataset. Specifying an individual file is only supported for
MapReduce programs.</p>
<div class="section" id="creating-a-fileset">
<h2>Creating a FileSet<a class="headerlink" href="#creating-a-fileset" title="Permalink to this headline">🔗</a></h2>
<p>To create and use a FileSet in an application, you create it as part of the application configuration:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="kd">public</span> <span class="kd">class</span> <span class="nc">FileSetExample</span> <span class="kd">extends</span> <span class="n">AbstractApplication</span> <span class="p">{</span>

  <span class="nd">@Override</span>
  <span class="kd">public</span> <span class="kt">void</span> <span class="nf">configure</span><span class="p">()</span> <span class="p">{</span>
    <span class="p">...</span>
    <span class="n">createDataset</span><span class="p">(</span><span class="s">&quot;lines&quot;</span><span class="p">,</span> <span class="n">FileSet</span><span class="p">.</span><span class="na">class</span><span class="p">,</span> <span class="n">FileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
      <span class="p">.</span><span class="na">setBasePath</span><span class="p">(</span><span class="s">&quot;example/data/lines&quot;</span><span class="p">)</span>
      <span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">TextInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
      <span class="p">.</span><span class="na">setOutputFormat</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
      <span class="p">.</span><span class="na">setOutputProperty</span><span class="p">(</span><span class="n">TextOutputFormat</span><span class="p">.</span><span class="na">SEPERATOR</span><span class="p">,</span> <span class="s">&quot;:&quot;</span><span class="p">)</span>
      <span class="p">.</span><span class="na">build</span><span class="p">());</span>
    <span class="p">...</span>
  <span class="p">}</span>
</pre></div>
</div>
<p>This creates a new FileSet named <em>lines</em> that uses <code class="docutils literal notranslate"><span class="pre">TextInputFormat</span></code> and <code class="docutils literal notranslate"><span class="pre">TextOutputFormat</span></code>.
For the output format, we specify an additional property to make it use a colon as the separator
between the key and the value in each line of output.</p>
<p>Input and output formats must be implementations of the standard Apache Hadoop
<a class="reference external" href="https://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapreduce/InputFormat.html">InputFormat</a>
and
<a class="reference external" href="https://hadoop.apache.org/docs/current/api/org/apache/hadoop/mapreduce/OutputFormat.html">OutputFormat</a>
specifications. If you do not specify an input format, you will not be able to use this as the input for a
MapReduce program; similarly for the output format.</p>
<p>If you do not specify a base path, the dataset framework will generate a path based on the dataset name.
This path—and any relative base path you specify—is relative to the data directory of the CDAP namespace
in which the FileSet is created. You can also specify an absolute base path (one that begins with the character <code class="docutils literal notranslate"><span class="pre">/</span></code>).
This path is interpreted as an absolute path in the file system. Beware that if you create two FileSets with the
same base path—be it multiple FileSets in the same namespace with the same relative base path, or in different
namespaces with the same absolute base path—then these multiple FileSets will use the same directory and possibly
obstruct each other’s operations.</p>
<p>You can configure a FileSet as “external”. This means that the data (the actual files) in
the FileSet are managed by an external process. This allows you to use FileSets with
existing locations outside of CDAP. In that case, the FileSet will not allow the writing
or deleting of files; it treats the contents of the base path as read-only:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="n">createDataset</span><span class="p">(</span><span class="s">&quot;lines&quot;</span><span class="p">,</span> <span class="n">FileSet</span><span class="p">.</span><span class="na">class</span><span class="p">,</span> <span class="n">FileSetProperties</span><span class="p">.</span><span class="na">builder</span><span class="p">()</span>
  <span class="p">.</span><span class="na">setBasePath</span><span class="p">(</span><span class="s">&quot;/existing/path&quot;</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setDataExternal</span><span class="p">(</span><span class="kc">true</span><span class="p">)</span>
  <span class="p">.</span><span class="na">setInputFormat</span><span class="p">(</span><span class="n">TextInputFormat</span><span class="p">.</span><span class="na">class</span><span class="p">)</span>
  <span class="p">...</span>
</pre></div>
</div>
<p>If you want to use an existing location and still be able to write to it, you have two options:</p>
<ul class="simple" id="datasets-fileset-reuse">
<li><code class="docutils literal notranslate"><span class="pre">setUseExisting(true)</span></code>: This directs the FileSet to accept an existing location as its base
path and an existing table in Hive for exploring. However, because the existing location may
contain files prior to the FileSet creation, the location and the Hive table will not be
deleted when the dataset is dropped, and truncating the FileSet will have no effect.
This is to ensure that no pre-existing data is deleted.</li>
<li><code class="docutils literal notranslate"><span class="pre">setPossessExisting(true)</span></code>: Similarly, this allows reuse of an existing location.
The FileSet will assume ownership of existing files in that location and of the Hive table,
which means that those files and the Hive table will be deleted when the dataset is dropped
or truncated.</li>
</ul>
</div>
<div class="section" id="using-a-fileset-in-mapreduce">
<h2>Using a FileSet in MapReduce<a class="headerlink" href="#using-a-fileset-in-mapreduce" title="Permalink to this headline">🔗</a></h2>
<p>Using a FileSet as input or output of a MapReduce program is the same as for any other dataset:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="kd">public</span> <span class="kd">class</span> <span class="nc">WordCount</span> <span class="kd">extends</span> <span class="n">AbstractMapReduce</span> <span class="p">{</span>

  <span class="nd">@Override</span>
  <span class="kd">public</span> <span class="kt">void</span> <span class="nf">initialize</span><span class="p">()</span> <span class="p">{</span>
    <span class="n">MapReduceContext</span> <span class="n">context</span> <span class="o">=</span> <span class="n">getContext</span><span class="p">();</span>
    <span class="n">context</span><span class="p">.</span><span class="na">addInput</span><span class="p">(</span><span class="n">Input</span><span class="p">.</span><span class="na">ofDataset</span><span class="p">(</span><span class="s">&quot;lines&quot;</span><span class="p">));</span>
    <span class="n">context</span><span class="p">.</span><span class="na">addOutput</span><span class="p">(</span><span class="n">Output</span><span class="p">.</span><span class="na">ofDataset</span><span class="p">(</span><span class="s">&quot;counts&quot;</span><span class="p">));</span>
    <span class="p">...</span>
  <span class="p">}</span>
</pre></div>
</div>
<p>The MapReduce program only needs to specify the names of the input and output datasets.
Whether they are FileSets or another type of dataset is handled by the CDAP runtime system.</p>
<p>However, you do need to tell CDAP the relative paths of the input and output files. Currently,
this is only possible by specifying them as runtime arguments when the MapReduce program is started:</p>

<script type="text/javascript">

  // Once the DOM is ready, attach a click handler to each tab (Linux and
  // Windows) of the "tabbedparsedliteral5" example. The handler is produced by
  // changeExampleTab, which is defined outside this part of the page.
  $(function tabbedparsedliteral5() {
    var tabSetID = 'linux-windows';
    var mapping = {'windows': 'windows', 'linux': 'linux'};

    ['linux', 'windows'].forEach(function (tab) {
      $("#tabbedparsedliteral5 .example-tab-" + tab).click(
        changeExampleTab(tab, mapping, "tabbedparsedliteral5", tabSetID)
      );
    });
  });

</script>
<div id="tabbedparsedliteral5" class="tabbed-parsed-literal dependent-linux-windows">
<ul class="tabbed-parsed-literal nav-tabs">
<li class="example-tab example-tab-linux active"><a href="#">Linux</a></li>
<li class="example-tab example-tab-windows "><a href="#">Windows</a></li>
</ul>

<div class="tab-contents">

<div class="tab-pane tab-pane-linux active">
<div class="code code-tab">
<div class="highlight-console">
<!-- tabbed-parsed-literal start -->
<div class="highlight"><pre><span></span><span class="gp">$</span> curl -w<span class="s2">&quot;\n&quot;</span> -X POST <span class="s2">&quot;http://example.com:11015/v3/namespaces/default/apps/FileSetExample/mapreduce/WordCount/start&quot;</span> <span class="se">\</span>
-d <span class="s1">&#39;{ &quot;dataset.lines.input.paths&quot;: &quot;monday/my.txt&quot;, &quot;dataset.counts.output.path&quot;: &quot;monday/counts.out&quot; }&#39;</span>
</pre></div>
<!-- tabbed-parsed-literal end --></div>
</div>
</div>
<div class="tab-pane tab-pane-windows ">
<div class="code code-tab">
<div class="highlight-shell-session">
<!-- tabbed-parsed-literal start -->
<div class="highlight"><pre><span></span><span class="gp">&gt;</span> curl -X POST <span class="s2">&quot;http://example.com:11015/v3/namespaces/default/apps/FileSetExample/mapreduce/WordCount/start&quot;</span> ^
<span class="go">-d &quot;{ \&quot;dataset.lines.input.paths\&quot;: \&quot;monday/my.txt\&quot;, \&quot;dataset.counts.output.path\&quot;: \&quot;monday/counts.out\&quot; }&quot;</span>
</pre></div>
<!-- tabbed-parsed-literal end --></div>
</div>
</div>
</div>
</div>
<p>Using the CDAP CLI:</p>

<script type="text/javascript">

  /**
   * Creates the click handler for one tab of the "tabbedparsedliteral6" widget.
   *
   * @param {string} tab  Suffix identifying the tab; it is appended to the
   *                      ".tab-pane-" and ".example-tab-" class selectors.
   * @returns {function}  Event handler that cancels the default click action,
   *                      moves the "active" class to the chosen pane and tab
   *                      header, and keeps the clicked element at the same
   *                      distance from the top of the viewport.
   */
  function change_tabbedparsedliteral6_ExampleTab(tab) {
    var widget = "#tabbedparsedliteral6";
    var paneSelector = widget + " .tab-pane";
    var headerSelector = widget + " .example-tab";

    return function(event) {
      event.preventDefault();

      // Distance of the clicked element from the top of the viewport, measured
      // before any pane is switched.
      var clicked = $(this);
      var viewportOffset = clicked.offset().top - $(document).scrollTop();

      // Deactivate every pane, then activate only the requested one.
      $(paneSelector).removeClass("active");
      $(paneSelector + "-" + tab).addClass("active");

      // Same for the tab headers.
      $(headerSelector).removeClass("active");
      $(headerSelector + "-" + tab).addClass("active");

      // Re-scroll so the clicked element ends up at the same viewport distance
      // it had before the classes changed.
      var restoredScrollTop = clicked.offset().top - viewportOffset;
      $(document).scrollTop(restoredScrollTop);
    };
  }

  // DOM-ready hook for the "tabbedparsedliteral6" widget: binds the click handler
  // built by change_tabbedparsedliteral6_ExampleTab to each of its tab headers
  // (this widget has a single tab, "cdap-cli").
  $(function() {
    ['cdap-cli'].forEach(function(tabName) {
      var tabHeader = $("#tabbedparsedliteral6 .example-tab-" + tabName);
      tabHeader.click(change_tabbedparsedliteral6_ExampleTab(tabName));
    });
  });

</script>
<div id="tabbedparsedliteral6" class="tabbed-parsed-literal independent">
<ul class="tabbed-parsed-literal nav-tabs">
<li class="example-tab example-tab-cdap-cli active"><a href="#">CDAP CLI</a></li>
</ul>

<div class="tab-contents">

<div class="tab-pane tab-pane-cdap-cli active">
<div class="code code-tab">
<div class="highlight-shell-session">
<!-- tabbed-parsed-literal start -->
<div class="highlight"><pre><span></span><span class="go">cdap &gt; start mapreduce FileSetExample.WordCount &quot;dataset.lines.input.paths=monday/my.txt dataset.counts.output.path=monday/counts.out&quot;</span>
</pre></div>
<!-- tabbed-parsed-literal end --></div>
</div>
</div>
</div>
</div>
<p>Note that for the input you can specify multiple paths separated by commas:</p>
<div class="highlight-console notranslate"><div class="highlight"><pre><span></span><span class="go">&quot;dataset.lines.input.paths&quot;: &quot;monday/lines.txt,tuesday/lines.txt&quot;</span>
</pre></div>
</div>
<p>If you omit either the input path or the output path, your MapReduce program will fail with an error.</p>
</div>
<div class="section" id="using-a-fileset-programmatically">
<h2>Using a FileSet Programmatically<a class="headerlink" href="#using-a-fileset-programmatically" title="Permalink to this headline">🔗</a></h2>
<p>You can interact with the files of a FileSet directly, through the <code class="docutils literal notranslate"><span class="pre">Location</span></code> abstraction
of the file system. For example, a Service can use a FileSet by declaring it with a <code class="docutils literal notranslate"><span class="pre">&#64;UseDataSet</span></code>
annotation, and then obtaining a <code class="docutils literal notranslate"><span class="pre">Location</span></code> for a relative path within the FileSet:</p>
<div class="highlight-java notranslate"><div class="highlight"><pre><span></span><span class="nd">@UseDataSet</span><span class="p">(</span><span class="s">&quot;lines&quot;</span><span class="p">)</span>
<span class="kd">private</span> <span class="n">FileSet</span> <span class="n">lines</span><span class="p">;</span>

<span class="nd">@GET</span>
<span class="nd">@Path</span><span class="p">(</span><span class="s">&quot;{fileSet}&quot;</span><span class="p">)</span>
<span class="kd">public</span> <span class="kt">void</span> <span class="nf">read</span><span class="p">(</span><span class="n">HttpServiceRequest</span> <span class="n">request</span><span class="p">,</span> <span class="n">HttpServiceResponder</span> <span class="n">responder</span><span class="p">,</span>
                 <span class="nd">@QueryParam</span><span class="p">(</span><span class="s">&quot;path&quot;</span><span class="p">)</span> <span class="n">String</span> <span class="n">filePath</span><span class="p">)</span> <span class="p">{</span>

  <span class="n">Location</span> <span class="n">location</span> <span class="o">=</span> <span class="n">lines</span><span class="p">.</span><span class="na">getLocation</span><span class="p">(</span><span class="n">filePath</span><span class="p">);</span>
  <span class="k">try</span> <span class="p">{</span>
    <span class="n">InputStream</span> <span class="n">inputStream</span> <span class="o">=</span> <span class="n">location</span><span class="p">.</span><span class="na">getInputStream</span><span class="p">();</span>
    <span class="p">...</span>
  <span class="p">}</span> <span class="k">catch</span> <span class="p">(</span><span class="n">IOException</span> <span class="n">e</span><span class="p">)</span> <span class="p">{</span>
    <span class="p">...</span>
  <span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
<p>See the Apache™ Twill®
<a class="reference external" href="http://twill.apache.org/apidocs/org/apache/twill/filesystem/Location.html">API documentation</a>
for additional information about the <code class="docutils literal notranslate"><span class="pre">Location</span></code> abstraction.</p>
</div>
<div class="section" id="exploring-filesets">
<h2>Exploring FileSets<a class="headerlink" href="#exploring-filesets" title="Permalink to this headline">🔗</a></h2>
<p>A file set can be explored with ad-hoc queries if you enable it at creation time;
this is described under <a class="reference internal" href="../../data-exploration/filesets.html#fileset-exploration"><span class="std std-ref">FileSet Exploration</span></a>.</p>
</div>
</div>

</div>
    <div class="col-md-2">
      <div id="right-sidebar" class="bs-sidenav scrollable-y" role="complementary">
        <div id="localtoc-scrollspy">
        </div>
      </div>
    </div></div>
</div>
<!-- block main content end -->
<!-- block footer -->
<footer class="footer">
      <div class="container">
        <div class="row">
          <div class="col-md-2 footer-left"><a title="Tables" href="table.html">Previous</a></div>
          <div class="col-md-8 footer-center"><a class="footer-tab-link" href="../../table-of-contents/../../reference-manual/licenses/index.html">Copyright</a> &copy; 2014-2020 Cask Data, Inc.&bull; <a class="footer-tab-link" href="//docs.cask.co/cdap/6.1.1/cdap-docs-6.1.1-web.zip" rel="nofollow">Download</a> an archive or
<a class="footer-tab-link" href="//docs.cask.co/cdap">switch the version</a> of the documentation
          </div>
          <div class="col-md-2 footer-right"><a title="Partitioned FileSet" href="partitioned-fileset.html">Next</a></div>
        </div>
      </div>
    </footer>
<!-- block footer end -->
<script type="text/javascript" src="../../_static/bootstrap-3.3.6/js/bootstrap.min.js"></script><script type="text/javascript" src="../../_static/js/bootstrap-sphinx.js"></script><script type="text/javascript" src="../../_static/js/abixTreeList-2.js"></script><script type="text/javascript" src="../../_static/js/cdap-dynamicscrollspy-4.js"></script><script type="text/javascript" src="../../_static/js/cdap-version-menu.js"></script><script type="text/javascript" src="../../_static/js/copy-to-clipboard.js"></script><script type="text/javascript" src="../../_static/js/jquery.mousewheel.min.js"></script><script type="text/javascript" src="../../_static/js/jquery.mCustomScrollbar.js"></script><script type="text/javascript" src="../../_static/js/js.cookie.js"></script><script type="text/javascript" src="../../_static/js/tabbed-parsed-literal-0.2.js"></script><script type="text/javascript" src="../../_static/js/cdap-onload-javascript.js"></script><script type="text/javascript" src="../../_static/js/cdap-version-menu.js"></script>
    <script src="https://cdap.gitee.io/docs/cdap/json-versions.js"></script>
  </body>
</html>